Visualizing and Comparing Four Measures of Centrality for 1YOK Protein Structure Network¶


In [1]:
class CentralityAnalysis:
    def __init__(self):
        super().__init__()
        # Initialize graph variable
        self.G = None
        
    def generate_graph(self):
        # Read data and create graph
        df = pd.read_csv('1YOK.cif_ringEdges', sep='\t')
        subset_df = df.loc[(df['NodeId1'].str.contains('A:')) & (df['NodeId2'].str.contains('A:'))]
        self.G = nx.from_pandas_edgelist(subset_df, 'NodeId1', 'NodeId2', create_using=nx.Graph())
        
    def generate_eigenVals(self):
        # Check if graph is generated
        if self.G is None:
            raise ValueError("Graph not generated. Call generate_graph() first.")
        
        eigen_centr = nx.eigenvector_centrality_numpy(self.G)
        eigendf = pd.DataFrame(list(eigen_centr.items()), columns=['node', 'value'])
        eigendf['norm_val'] = eigendf['value'] / eigendf['value'].max()
        return eigendf
        
    def generate_closeVals(self):
        if self.G is None:
            raise ValueError("Graph not generated. Call generate_graph() first.")
        
        close_centr = nx.closeness_centrality(self.G)
        closedf = pd.DataFrame(list(close_centr.items()), columns=['node', 'value'])
        closedf['norm_val'] = closedf['value'] / closedf['value'].max()
        return closedf
        
    def generate_degrVals(self):
        if self.G is None:
            raise ValueError("Graph not generated. Call generate_graph() first.")
        
        degr_centr = nx.degree_centrality(self.G)
        degrdf = pd.DataFrame(list(degr_centr.items()), columns=['node', 'value'])
        degrdf['norm_val'] = degrdf['value'] / degrdf['value'].max()
        return degrdf
        
    def generate_betwVals(self):
        if self.G is None:
            raise ValueError("Graph not generated. Call generate_graph() first.")
        
        betw_centr = nx.betweenness_centrality(self.G)
        betwdf = pd.DataFrame(list(betw_centr.items()), columns=['node', 'value'])
        betwdf['norm_val'] = betwdf['value'] / betwdf['value'].max()
        return betwdf
            
    def generate_edgeBetw(self):
        if self.G is None:
            raise ValueError("Graph not generated. Call generate_graph() first.")
        
        edge_betw = nx.edge_betweenness_centrality(self.G)
        edge_betwdf = pd.DataFrame(list(edge_betw.items()), columns=['Pair', 'Edge Value'])
        edge_betwdf[['Node1', 'Node2']] = pd.DataFrame(edge_betwdf['Pair'].tolist(), index=edge_betwdf.index)
        edge_betwdf = edge_betwdf[['Node1', 'Node2', 'Edge Value']]
        edge_betwdf['Norm Value'] = edge_betwdf['Edge Value'] / edge_betwdf['Edge Value'].max()
        return edge_betwdf
In [3]:
import pandas as pd
import numpy as np
import networkx as nx
import plotly.graph_objects as go

datasets = ['eigenVals', 'betwVals', 'degrVals', 'closeVals']
dataframes = {}

newGraph = CentralityAnalysis()
newGraph.generate_graph()

for dataset in datasets:
    df = getattr(newGraph, f'generate_{dataset}')()
    df['node'] = df['node'].str.replace(r'\D', '', regex=True)
    df['node'] = pd.to_numeric(df['node'])
    df = df.sort_values(by='node', ascending=True)
    df = df.drop(df.index[-1])
    dataframes[dataset] = df

# Calculate the global y-axis range across all datasets
min_val = min(df['norm_val'].min() for df in dataframes.values())
max_val = max(df['norm_val'].max() for df in dataframes.values())

# Create a single plot
fig = go.Figure()

# Define traces for each dataset
for df, name, color in zip(dataframes.values(), ['Eigenvals', 'Betwvals', 'Degrvals', 'Closevals'], ['#32CD32', '#FF5733', '#0000FF', '#FFA500']):
    fig.add_trace(
        go.Scatter(
            x=df['node'],
            y=df['norm_val'],
            mode='lines',
            name=name,
            marker=dict(color=color),
            visible=True,  # Start traces as visible
            customdata=list(zip(dataframes['eigenVals']['norm_val'], dataframes['betwVals']['norm_val'], dataframes['degrVals']['norm_val'], dataframes['closeVals']['norm_val'])),
            hovertemplate='<b>Node:</b> %{x}<br>' +
                          '<b>Eigenvals:</b> %{customdata[0]}<br>' +
                          '<b>Betwvals:</b> %{customdata[1]}<br>' +
                          '<b>Degrvals:</b> %{customdata[2]}<br>' +
                          '<b>Closevals:</b> %{customdata[3]}<extra></extra>'
        )
    )

# Set the same y-axis range for all subplots
fig.update_yaxes(range=[min_val, max_val])

# Update layout to allow toggling visibility via legend
fig.update_layout(legend=dict(itemclick='toggle'))

# Define a callback function to update visibility based on legend clicks
def update_visibility(trace, visible, legend_item):
    all_traces_visible = all(data.visible for data in fig.data)
    if not visible and all_traces_visible:
        return
    elif all_traces_visible:
        for data in fig.data:
            data.visible = False
    else:
        trace.visible = visible

# Attach callback function to legend clicks
for trace in fig.data:
    trace.on_click(update_visibility)

# Update layout to ggplot2 style
fig.update_layout(
    title='Comparing Centrality Metrics Across Residues',
    title_font=dict(family='Arial', size=24, color='black'),  # Modify title font
    xaxis=dict(title='Residue Position', title_font=dict(family='Arial', size=16, color='black')),  # Modify x-axis font
    yaxis=dict(title='Centrality Value', title_font=dict(family='Arial', size=16, color='black')),  # Modify y-axis font
    template='ggplot2'  # Apply ggplot2 style
)

# Display the plot
fig.show()
In [2]:
import plotly.subplots as sp
from scipy.stats import linregress

# Define function to create the 2x2 plot
def create_2x2_plot():
    fig = sp.make_subplots(rows=2, cols=2, subplot_titles=('Betweenness', 'Degree', 'Closeness', 'Eigenvector'), shared_xaxes=False, shared_yaxes=False)

    # Define the x-axis data (Eigenvector centrality)
    x_values = norm_vals['Eigenvector centrality values']

    # Define colors for each centrality
    centrality_colors = {'Betweenness': 'salmon', 'Degree': 'blue', 'Closeness': 'green', 'Eigenvector': 'purple'}

    # Define lists to store r and r-squared values for each centrality
    r_values = []
    r_squared_values = []

    # Betweenness centrality
    slope, intercept, r_value, p_value, std_err = linregress(x_values, norm_vals['Betweenness centrality values'])
    regression_line = slope * np.array(x_values) + intercept
    r_values.append(r_value)
    r_squared_values.append(r_value ** 2)
    fig.add_trace(go.Scatter(x=x_values, y=norm_vals['Betweenness centrality values'], mode='markers', name='Betweenness', marker=dict(color=centrality_colors['Betweenness'])), row=1, col=1)
    fig.add_trace(go.Scatter(x=x_values, y=regression_line, mode='lines', showlegend=False, line=dict(color='olive')), row=1, col=1)

    # Degree centrality
    slope, intercept, r_value, p_value, std_err = linregress(x_values, norm_vals['Degree centrality values'])
    regression_line = slope * np.array(x_values) + intercept
    r_values.append(r_value)
    r_squared_values.append(r_value ** 2)
    fig.add_trace(go.Scatter(x=x_values, y=norm_vals['Degree centrality values'], mode='markers', name='Degree', marker=dict(color=centrality_colors['Degree'])), row=1, col=2)
    fig.add_trace(go.Scatter(x=x_values, y=regression_line, mode='lines', showlegend=False, line=dict(color='olive')), row=1, col=2)

    # Closeness centrality
    slope, intercept, r_value, p_value, std_err = linregress(x_values, norm_vals['Closeness centrality values'])
    regression_line = slope * np.array(x_values) + intercept
    r_values.append(r_value)
    r_squared_values.append(r_value ** 2)
    fig.add_trace(go.Scatter(x=x_values, y=norm_vals['Closeness centrality values'], mode='markers', name='Closeness', marker=dict(color=centrality_colors['Closeness'])), row=2, col=1)
    fig.add_trace(go.Scatter(x=x_values, y=regression_line, mode='lines', showlegend=False, line=dict(color='olive')), row=2, col=1)

    # Eigenvector centrality (dots)
    r_values.append(1)  # R-value for straight line (Eigenvector centrality)
    r_squared_values.append(1)  # R-squared for straight line (Eigenvector centrality)
    fig.add_trace(go.Scatter(x=x_values, y=x_values, mode='markers', name='Eigenvector', marker=dict(color=centrality_colors['Eigenvector'])), row=2, col=2)

    # Update subplot titles with R and R-squared values
    for i, (title, r_value, r_squared) in enumerate(zip(fig.layout.annotations, r_values, r_squared_values)):
        title.text += f'<br>R: {r_value:.2f}, R-squared: {r_squared:.2f}'

    # Update layout
    fig.update_layout(template='ggplot2',  # Set theme to ggplot2-like
                      font=dict(family="Arial", size=12, color="black"),
                      legend=dict(x=0, y=-0.30, traceorder="normal", itemsizing='constant'),  # Adjust itemsizing to avoid resizing
                      title_text="Centrality Values vs. Eigenvector Centrality",
                      width=1000, height=800,
                      xaxis=dict(title=dict(text='Eigenvector Centrality', standoff=335),  # Set x-axis label
                                 tickfont=dict(size=12)),  # Set x-axis tick font size
                      yaxis=dict(title=dict(text='Centrality Values', standoff=0),  # Set y-axis label
                                 tickfont=dict(size=12)))  # Set y-axis tick font size
    fig.show()

# Call the function to create the 2x2 plot
create_2x2_plot()
In [ ]: